{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2006, 2)"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "with open('C:/Users/win8/PycharmProjects/textmining/venv/1111stop.txt', encoding='utf8') as file:\n",
    "    stopWord_list = [k.strip() for k in file.readlines()]\n",
    "\n",
    "train_df = pd.read_csv('C:/Users/win8/PycharmProjects/textmining/venv/train.txt',sep='\\t', header=None,)\n",
    "train_df = train_df.astype(str)\n",
    "train_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache C:\\Users\\win8\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 1.213 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "前1000篇文章分词共花费42.50秒\n",
      "前2000篇文章分词共花费71.21秒\n"
     ]
    }
   ],
   "source": [
    "import jieba\n",
    "import time\n",
    "\n",
    "train_df.columns = ['分类', '文章']\n",
    "stopword_list = [k.strip() for k in open('C:/Users/win8/Desktop/1111stop.txt', encoding='utf8').readlines() if k.strip() != '']\n",
    "cutWords_list = []\n",
    "i = 0\n",
    "startTime = time.time()\n",
    "for article in train_df['文章']:\n",
    "    cutWords = [k for k in jieba.cut(article) if k not in stopword_list]\n",
    "    i += 1\n",
    "    if i % 1000 == 0:\n",
    "        print('前%d篇文章分词共花费%.2f秒' %(i, time.time()-startTime))\n",
    "    cutWords_list.append(cutWords)\n",
    "with open('C:/Users/win8/PycharmProjects/textmining/venv/cutWords_list.txt', 'w',encoding='utf-8') as file: \n",
    "    for cutWords in cutWords_list:\n",
    "        file.write(' '.join(cutWords) + '\\n')\n",
    "with open('C:/Users/win8/PycharmProjects/textmining/venv/cutWords_list.txt',encoding='utf-8') as file:\n",
    "    cutWords_list = [k.split() for k in file.readlines()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2006, 107516)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "tf1 = TfidfVectorizer(cutWords_list)\n",
    "X = tf1.fit_transform(train_df['文章'])\n",
    "print(X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2006, 11578)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "tfidf = TfidfVectorizer(cutWords_list, min_df=10, max_df=0.6)\n",
    "X = tfidf.fit_transform(train_df['文章'])\n",
    "print(X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2006,)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "import pandas as pd\n",
    "\n",
    "train_df = pd.read_csv('C:/Users/win8/PycharmProjects/textmining/venv/train.txt', sep='\\t', header=None)\n",
    "train_df = train_df.astype(str)\n",
    "labelEncoder = LabelEncoder()\n",
    "y = labelEncoder.fit_transform(train_df[0].values)\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "互联网    252\n",
       "娱乐     252\n",
       "财经     251\n",
       "零售     251\n",
       "体育     250\n",
       "资讯     250\n",
       "电商     250\n",
       "军事     250\n",
       "Name: 0, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df[0].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time  \n",
    "from sklearn import metrics  \n",
    "import pickle as pickle  \n",
    "import pandas as pd\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2006, 500)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#降维\n",
    "from sklearn.feature_selection import SelectKBest,chi2\n",
    "X=SelectKBest(chi2,k=500).fit_transform(X,y)\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1604, 500)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#划分训练测试集\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2)\n",
    "train_x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "            min_samples_leaf=1, min_samples_split=3,\n",
       "            min_weight_fraction_leaf=0.0, n_estimators=9, n_jobs=None,\n",
       "            oob_score=False, random_state=None, verbose=0,\n",
       "            warm_start=False)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "model = RandomForestClassifier(n_estimators=9,min_samples_split=3)\n",
    "model.fit(train_x, train_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precision: 98.04%, recall: 97.95%\n",
      "accuracy: 98.01%\n"
     ]
    }
   ],
   "source": [
    "#模型评估模块\n",
    "predict = model.predict(test_x)\n",
    "precision = metrics.precision_score(test_y, predict, average='macro')\n",
    "recall = metrics.recall_score(test_y, predict, average='macro')\n",
    "print('precision: %.2f%%, recall: %.2f%%' % (100 * precision, 100 * recall))\n",
    "accuracy = metrics.accuracy_score(test_y, predict)\n",
    "print('accuracy: %.2f%%' % (100 * accuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>互联网</th>\n",
       "      <th>体育</th>\n",
       "      <th>军事</th>\n",
       "      <th>娱乐</th>\n",
       "      <th>电商</th>\n",
       "      <th>财经</th>\n",
       "      <th>资讯</th>\n",
       "      <th>零售</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>互联网</th>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>体育</th>\n",
       "      <td>0</td>\n",
       "      <td>63</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>军事</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>46</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>娱乐</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>49</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>电商</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>55</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>财经</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>40</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>资讯</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>47</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>零售</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>43</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     互联网  体育  军事  娱乐  电商  财经  资讯  零售\n",
       "互联网   51   0   0   0   0   0   0   0\n",
       "体育     0  63   0   1   0   0   0   0\n",
       "军事     0   0  46   0   0   0   0   0\n",
       "娱乐     0   0   0  49   0   0   0   0\n",
       "电商     1   0   0   0  55   0   0   0\n",
       "财经     1   0   0   1   2  40   0   2\n",
       "资讯     0   0   0   0   0   0  47   0\n",
       "零售     0   0   0   0   0   0   0  43"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#绘制混淆矩阵\n",
    "from sklearn.metrics import confusion_matrix\n",
    "pd.DataFrame(confusion_matrix(test_y, predict), \n",
    "             columns=labelEncoder.classes_, \n",
    "             index=labelEncoder.classes_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存模型\n",
    "import pickle\n",
    "with open('rfclassifier.model', 'wb') as file:\n",
    "    save = {\n",
    "        'labelEncoder' : labelEncoder,\n",
    "        'tfidfVectorizer' : tfidf,\n",
    "        'logistic_model' : model\n",
    "    }\n",
    "    pickle.dump(save, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 295,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The parameters of the best model are: \n",
      "{'C': 4, 'max_iter': 50}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\win8\\PycharmProjects\\weixin\\venv\\lib\\site-packages\\sklearn\\svm\\base.py:244: ConvergenceWarning: Solver terminated early (max_iter=50).  Consider pre-processing your data with StandardScaler or MinMaxScaler.\n",
      "  % self.max_iter, ConvergenceWarning)\n",
      "C:\\Users\\win8\\PycharmProjects\\weixin\\venv\\lib\\site-packages\\sklearn\\utils\\deprecation.py:125: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n",
      "  warnings.warn(*warn_args, **warn_kwargs)\n",
      "C:\\Users\\win8\\PycharmProjects\\weixin\\venv\\lib\\site-packages\\sklearn\\utils\\deprecation.py:125: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n",
      "  warnings.warn(*warn_args, **warn_kwargs)\n",
      "C:\\Users\\win8\\PycharmProjects\\weixin\\venv\\lib\\site-packages\\sklearn\\utils\\deprecation.py:125: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n",
      "  warnings.warn(*warn_args, **warn_kwargs)\n",
      "C:\\Users\\win8\\PycharmProjects\\weixin\\venv\\lib\\site-packages\\sklearn\\utils\\deprecation.py:125: FutureWarning: You are accessing a training score ('split3_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n",
      "  warnings.warn(*warn_args, **warn_kwargs)\n",
      "C:\\Users\\win8\\PycharmProjects\\weixin\\venv\\lib\\site-packages\\sklearn\\utils\\deprecation.py:125: FutureWarning: You are accessing a training score ('split4_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n",
      "  warnings.warn(*warn_args, **warn_kwargs)\n",
      "C:\\Users\\win8\\PycharmProjects\\weixin\\venv\\lib\\site-packages\\sklearn\\utils\\deprecation.py:125: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n",
      "  warnings.warn(*warn_args, **warn_kwargs)\n",
      "C:\\Users\\win8\\PycharmProjects\\weixin\\venv\\lib\\site-packages\\sklearn\\utils\\deprecation.py:125: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n",
      "  warnings.warn(*warn_args, **warn_kwargs)\n"
     ]
    }
   ],
   "source": [
    "#交叉验证\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.svm import SVC\n",
    "parameters = { 'C':[1, 2, 4], 'max_iter':[10,50,100]}\n",
    "svc = SVC(gamma='auto',probability=True,kernel='rbf')\n",
    "model = GridSearchCV(svc, parameters, n_jobs=-1,cv=5)\n",
    "model.fit(train_x, train_y)\n",
    "cv_result = pd.DataFrame.from_dict(model.cv_results_)   \n",
    "print('The parameters of the best model are: ')\n",
    "print(model.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
